gen-synth-data

Sleeping

App Files Files Community

Ben Burtenshaw committed on Apr 26, 2024

Commit

32014a1

•

1 Parent(s): 7055b44

lose codeless version

Browse files

Files changed (3) hide show

pages/2_👩🏼‍🔬 Describe Domain.py +20 -22
pages/3_🌱 Generate Dataset.py +51 -179
utils.py +24 -2

pages/2_👩🏼‍🔬 Describe Domain.py CHANGED Viewed

@@ -2,14 +2,9 @@ import json
 import streamlit as st
-from hub import push_dataset_to_hub
 from infer import query
 from defaults import (
-    DEFAULT_DOMAIN,
-    DEFAULT_PERSPECTIVES,
-    DEFAULT_TOPICS,
-    DEFAULT_EXAMPLES,
-    DEFAULT_SYSTEM_PROMPT,
     N_PERSPECTIVES,
     N_TOPICS,
     SEED_DATA_PATH,
@@ -18,12 +13,14 @@ from defaults import (
 )
 from utils import project_sidebar
 st.set_page_config(
     page_title="Domain Data Grower",
     page_icon="🧑‍🌾",
 )
 project_sidebar()
 ################################################################################
 # HEADER
 ################################################################################
@@ -37,6 +34,23 @@ st.write(
     "Define the project details, including the project name, domain, and API credentials"
 )
 ################################################################################
 # Domain Expert Section
 ################################################################################
@@ -212,22 +226,6 @@ with tab_raw_seed:
 st.divider()
-hub_username = DATASET_REPO_ID.split("/")[0]
-project_name = DATASET_REPO_ID.split("/")[1]
-st.write("Define the dataset repo details on the Hub")
-st.session_state["project_name"] = st.text_input("Project Name", project_name)
-st.session_state["hub_username"] = st.text_input("Hub Username", hub_username)
-st.session_state["hub_token"] = st.text_input("Hub Token", type="password", value=None)
-if all(
-    (
-        st.session_state.get("project_name"),
-        st.session_state.get("hub_username"),
-        st.session_state.get("hub_token"),
-    )
-):
-    st.success(f"Using the dataset repo {hub_username}/{project_name} on the Hub")
 if st.button("🤗 Push Dataset Seed") and all(
     (

 import streamlit as st
+from hub import push_dataset_to_hub, pull_seed_data_from_repo
 from infer import query
 from defaults import (
     N_PERSPECTIVES,
     N_TOPICS,
     SEED_DATA_PATH,
 )
 from utils import project_sidebar
 st.set_page_config(
     page_title="Domain Data Grower",
     page_icon="🧑‍🌾",
 )
 project_sidebar()
 ################################################################################
 # HEADER
 ################################################################################
     "Define the project details, including the project name, domain, and API credentials"
 )
+################################################################################
+# LOAD EXISTING DOMAIN DATA
+################################################################################
+DATASET_REPO_ID = (
+    f"{st.session_state['hub_username']}/{st.session_state['project_name']}"
+)
+SEED_DATA = pull_seed_data_from_repo(
+    DATASET_REPO_ID, hub_token=st.session_state["hub_token"]
+)
+DEFAULT_DOMAIN = SEED_DATA.get("domain", "")
+DEFAULT_PERSPECTIVES = SEED_DATA.get("perspectives", [""])
+DEFAULT_TOPICS = SEED_DATA.get("topics", [""])
+DEFAULT_EXAMPLES = SEED_DATA.get("examples", [{"question": "", "answer": ""}])
+DEFAULT_SYSTEM_PROMPT = SEED_DATA.get("domain_expert_prompt", "")
 ################################################################################
 # Domain Expert Section
 ################################################################################
 st.divider()
 if st.button("🤗 Push Dataset Seed") and all(
     (

pages/3_🌱 Generate Dataset.py CHANGED Viewed

@@ -1,18 +1,8 @@
 import streamlit as st
-from hub import pull_seed_data_from_repo, push_pipeline_to_hub
-from defaults import (
-    DEFAULT_SYSTEM_PROMPT,
-    PIPELINE_PATH,
-    PROJECT_NAME,
-    ARGILLA_URL,
-    HUB_USERNAME,
-    CODELESS_DISTILABEL,
-)
 from utils import project_sidebar
-from pipeline import serialize_pipeline, run_pipeline, create_pipelines_run_command
 st.set_page_config(
     page_title="Domain Data Grower",
     page_icon="🧑‍🌾",
@@ -27,20 +17,15 @@ project_sidebar()
 st.header("🧑‍🌾 Domain Data Grower")
 st.divider()
 st.subheader("Step 3. Run the pipeline to generate synthetic data")
-st.write("Define the project repos and models that the pipeline will use.")
-st.divider()
 ###############################################################
 # CONFIGURATION
 ###############################################################
-st.markdown("## Pipeline Configuration")
-st.markdown("#### 🤗 Hub details to pull the seed data")
-hub_username = st.text_input("Hub Username", HUB_USERNAME)
-project_name = st.text_input("Project Name", PROJECT_NAME)
-repo_id = f"{hub_username}/{project_name}"
-hub_token = st.text_input("Hub Token", type="password")
 st.divider()
@@ -89,169 +74,56 @@ st.divider()
 st.markdown("## Run the pipeline")
-st.write(
-    "Once you've defined the pipeline configuration, you can run the pipeline from your local machine."
 )
-if CODELESS_DISTILABEL:
-    st.write(
-        """We recommend running the pipeline locally if you're planning on generating a large dataset. \
-            But running the pipeline on this space is a handy way to get started quickly. Your synthetic
-            samples will be pushed to Argilla and available for review.
-            """
-    )
-    st.write(
-        """If you're planning on running the pipeline on the space, be aware that it \
-            will take some time to complete and you will need to maintain a \
-            connection to the space."""
-    )
-if st.button("💻 Run pipeline locally", key="run_pipeline_local"):
-    if all(
-        [
-            argilla_api_key,
-            argilla_url,
-            base_url,
-            hub_username,
-            project_name,
-            hub_token,
-            argilla_dataset_name,
-        ]
-    ):
-        with st.spinner("Pulling seed data from the Hub..."):
-            try:
-                seed_data = pull_seed_data_from_repo(
-                    repo_id=f"{hub_username}/{project_name}",
-                    hub_token=hub_token,
-                )
-            except Exception:
-                st.error(
-                    "Seed data not found. Please make sure you pushed the data seed in Step 2."
-                )
-            domain = seed_data["domain"]
-            perspectives = seed_data["perspectives"]
-            topics = seed_data["topics"]
-            examples = seed_data["examples"]
-            domain_expert_prompt = seed_data["domain_expert_prompt"]
-        with st.spinner("Serializing the pipeline configuration..."):
-            serialize_pipeline(
-                argilla_api_key=argilla_api_key,
-                argilla_dataset_name=argilla_dataset_name,
-                argilla_api_url=argilla_url,
-                topics=topics,
-                perspectives=perspectives,
-                pipeline_config_path=PIPELINE_PATH,
-                domain_expert_prompt=domain_expert_prompt or DEFAULT_SYSTEM_PROMPT,
-                hub_token=hub_token,
-                endpoint_base_url=base_url,
-                examples=examples,
-            )
-            push_pipeline_to_hub(
-                pipeline_path=PIPELINE_PATH,
-                hub_token=hub_token,
-                hub_username=hub_username,
-                project_name=project_name,
-            )
-        st.success(f"Pipeline configuration saved to {hub_username}/{project_name}")
-        st.info(
-            "To run the pipeline locally, you need to have the `distilabel` library installed. You can install it using the following command:"
-        )
-        st.text(
-            "Execute the following command to generate a synthetic dataset from the seed data:"
-        )
-        command_to_run = create_pipelines_run_command(
-            hub_token=hub_token,
-            pipeline_config_path=PIPELINE_PATH,
-            argilla_dataset_name=argilla_dataset_name,
-            argilla_api_key=argilla_api_key,
-            argilla_api_url=argilla_url,
-        )
-        st.code(
-            f"""
-            pip install git+https://github.com/argilla-io/distilabel.git
-            git clone https://huggingface.co/datasets/{hub_username}/{project_name}
-            cd {project_name}
-            pip install -r requirements.txt
-            {' '.join(["python"] + command_to_run[1:])}
         """,
-            language="bash",
-        )
-        st.subheader(
-            "👩‍🚀 If you want to access the pipeline and manipulate the locally, you can do:"
-        )
-        st.code(
-            """
-            git clone https://github.com/huggingface/data-is-better-together
-            cd domain-specific-datasets
-            """
-        )
-    else:
-        st.error("Please fill all the required fields.")
-###############################################################
-# SPACE
-###############################################################
-if CODELESS_DISTILABEL:
-    if st.button("🔥 Run pipeline right here, right now!"):
-        if all(
-            [
-                argilla_api_key,
-                argilla_url,
-                base_url,
-                hub_username,
-                project_name,
-                hub_token,
-                argilla_dataset_name,
-            ]
-        ):
-            with st.spinner("Pulling seed data from the Hub..."):
-                try:
-                    seed_data = pull_seed_data_from_repo(
-                        repo_id=f"{hub_username}/{project_name}",
-                        hub_token=hub_token,
-                    )
-                except Exception as e:
-                    st.error(
-                        "Seed data not found. Please make sure you pushed the data seed in Step 2."
-                    )
-                domain = seed_data["domain"]
-                perspectives = seed_data["perspectives"]
-                topics = seed_data["topics"]
-                examples = seed_data["examples"]
-                domain_expert_prompt = seed_data["domain_expert_prompt"]
-                serialize_pipeline(
-                    argilla_api_key=argilla_api_key,
-                    argilla_dataset_name=argilla_dataset_name,
-                    argilla_api_url=argilla_url,
-                    topics=topics,
-                    perspectives=perspectives,
-                    pipeline_config_path=PIPELINE_PATH,
-                    domain_expert_prompt=domain_expert_prompt or DEFAULT_SYSTEM_PROMPT,
-                    hub_token=hub_token,
-                    endpoint_base_url=base_url,
-                    examples=examples,
-                )
-            with st.spinner("Starting the pipeline..."):
-                logs = run_pipeline(
-                    pipeline_config_path=PIPELINE_PATH,
-                    argilla_api_key=argilla_api_key,
-                    argilla_api_url=argilla_url,
-                    hub_token=hub_token,
-                    argilla_dataset_name=argilla_dataset_name,
-                )
-            st.success(f"Pipeline started successfully! 🚀")
-            with st.expander(label="View Logs", expanded=True):
-                for out in logs:
-                    st.text(out)
-        else:
-            st.error("Please fill all the required fields.")

 import streamlit as st
+from defaults import ARGILLA_URL
 from utils import project_sidebar
 st.set_page_config(
     page_title="Domain Data Grower",
     page_icon="🧑‍🌾",
 st.header("🧑‍🌾 Domain Data Grower")
 st.divider()
 st.subheader("Step 3. Run the pipeline to generate synthetic data")
+st.write("Define the distilabel pipeline for generating the dataset.")
 ###############################################################
 # CONFIGURATION
 ###############################################################
+hub_username = st.session_state.get("hub_username")
+project_name = st.session_state.get("project_name")
+hub_token = st.session_state.get("hub_token")
 st.divider()
 st.markdown("## Run the pipeline")
+st.markdown(
+    "Once you've defined the pipeline configuration above, you can run the pipeline from your local machine."
 )
+if all(
+    [
+        argilla_api_key,
+        argilla_url,
+        base_url,
+        hub_token,
+        project_name,
+        hub_token,
+        argilla_dataset_name,
+    ]
+):
+    st.markdown(
+        "To run the pipeline locally, you need to have the `distilabel` library installed. You can install it using the following command:"
+    )
+    st.code(
+        f"""
+        # Install the distilabel library
+        pip install git+https://github.com/argilla-io/distilabel.git
+        """
+    )
+    st.markdown("Next, you'll need to clone your dataset repo and run the pipeline:")
+    st.code(
+        f"""
+        # Clone the project and install the requirements
+        git clone https://huggingface.co/datasets/{hub_username}/{project_name}
+        cd {project_name}
+        pip install -r requirements.txt
+        # Run the pipeline
+        python pipeline.py
+            --argilla-api-key {argilla_api_key}
+            --argilla-api-url {argilla_url}
+            --argilla-dataset-name {argilla_dataset_name}
+            --endpoint-base-url {base_url}
+            --hub-token {st.session_state["hub_token"]}
         """,
+        language="bash",
+    )
+    st.markdown(
+        "👩‍🚀 If you want to customise the pipeline take a look in `pipeline.py` and teh [distilabel docs](https://distilabel.argilla.io/)"
+    )
+else:
+    st.info("Please fill all the required fields.")

utils.py CHANGED Viewed

@@ -26,8 +26,30 @@ def project_sidebar():
     )
     st.sidebar.link_button(f"📚 Dataset Repo", DATASET_URL)
     st.sidebar.link_button(f"🤖 Argilla Space", ARGILLA_URL)
-    st.sidebar.divider()
-    st.sidebar.link_button("🧑‍🌾 New Project", DIBT_PARENT_APP_URL)
     st.sidebar.link_button(
         "🤗 Get your Hub Token", "https://huggingface.co/settings/tokens"
     )

     )
     st.sidebar.link_button(f"📚 Dataset Repo", DATASET_URL)
     st.sidebar.link_button(f"🤖 Argilla Space", ARGILLA_URL)
+    hub_username = DATASET_REPO_ID.split("/")[0]
+    project_name = DATASET_REPO_ID.split("/")[1]
+    st.session_state["project_name"] = project_name
+    st.session_state["hub_username"] = hub_username
+    st.session_state["hub_token"] = st.sidebar.text_input(
+        "Hub Token", type="password", value=None
+    )
     st.sidebar.link_button(
         "🤗 Get your Hub Token", "https://huggingface.co/settings/tokens"
     )
+    if all(
+        (
+            st.session_state.get("project_name"),
+            st.session_state.get("hub_username"),
+            st.session_state.get("hub_token"),
+        )
+    ):
+        st.success(f"Using the dataset repo {hub_username}/{project_name} on the Hub")
+    st.sidebar.divider()
+    st.sidebar.link_button("🧑‍🌾 New Project", DIBT_PARENT_APP_URL)
+    if st.session_state["hub_token"] is None:
+        st.error("Please provide a Hub token to generate answers")
+        st.stop()