burtenshaw (HF staff) committed
Commit cdb761d • 1 parent: f92d1a9

Upload 5 files

Files changed (4):
  1. app.py +90 -66
  2. hub.py +32 -98
  3. project_config.json +1 -1
  4. seed_data.json +2 -26
app.py CHANGED
@@ -1,94 +1,118 @@
- import streamlit as st
-
- from defaults import (
-     PROJECT_NAME,
-     ARGILLA_SPACE_REPO_ID,
-     DATASET_REPO_ID,
-     ARGILLA_URL,
-     PROJECT_SPACE_REPO_ID,
-     DIBT_PARENT_APP_URL,
  )
- from utils import project_sidebar

- st.set_page_config("Domain Data Grower", page_icon="🧑‍🌾")

- project_sidebar()

- if PROJECT_NAME == "DEFAULT_DOMAIN":
-     st.warning(
-         "Please set up the project configuration in the parent app before proceeding."
-     )
-     st.stop()

  st.header("🧑‍🌾 Domain Data Grower")
  st.divider()

- st.markdown(
-     """
-     ## 🌱 Create a dataset seed for aligning models to a specific domain
-
-     This app helps you create a dataset seed for building diverse domain-specific datasets for aligning models.
-     Alignment datasets are used to fine-tune models to a specific domain or task, but as yet, there's a shortage of diverse datasets for this purpose.
-     """
- )
- st.markdown(
-     """
-     ## 🚜 How it works
-
-     You can create a dataset seed by defining the domain expertise, perspectives, topics, and examples for your domain-specific dataset.
-     The dataset seed is then used to generate synthetic data for training a language model.
-
-     """
  )
- st.markdown(
-     """
-     ## 🗺️ The process
-
-     ### Step 1: Setup the project
-
-     Define the project details, including the project name, domain, and API credentials. Create Dataset Repo on the Hub.
-     """
- )
- st.link_button("🚀 Setup Project via the parent app", DIBT_PARENT_APP_URL)

  st.markdown(
-     """
-     ### Step 2: Describe the Domain
-
-     Define the domain expertise, perspectives, topics, and examples for your domain-specific dataset.
-     You can collaborate with domain experts to define the domain expertise and perspectives.
      """
  )

  st.page_link(
-     "pages/2_👩🏼‍🔬 Describe Domain.py",
-     label="Describe Domain",
-     icon="👩🏼‍🔬",
  )

- st.markdown(
-     """
-     ### Step 3: Generate Synthetic Data
-
-     Use distilabel to generate synthetic data for your domain-specific dataset.
-     You can run the pipeline locally or in this space to generate synthetic data.
-     """
- )
-
- st.page_link(
-     "pages/3_🌱 Generate Dataset.py",
-     label="Generate Dataset",
-     icon="🌱",
- )
-
- st.markdown(
-     """
-     ### Step 4: Review the Dataset
-
-     Use Argilla to review the generated synthetic data and provide feedback on the quality of the data.
-
-     """
- )
- st.link_button("🔍 Review the dataset in Argilla", ARGILLA_URL)
+ import time
+
+ from hub import (
+     setup_dataset_on_hub,
+     duplicate_space_on_hub,
+     add_project_config_to_space_repo,
  )
+
+ import streamlit as st
+
+ # Constants
+ # Written here to avoid defaults.py
+ DEFAULT_DOMAIN = "farming"
+
+ st.set_page_config(
+     "Domain Data Grower", page_icon="🧑‍🌾", initial_sidebar_state="collapsed"
+ )

  st.header("🧑‍🌾 Domain Data Grower")
  st.divider()

+ st.sidebar.link_button(
+     "🤗 Get your Hub Token", "https://huggingface.co/settings/tokens"
  )

+ ################################################################################
+ # APP MARKDOWN
+ ################################################################################
+
+ st.header("🌱 Create a domain specific dataset")
+
  st.markdown(
+     """This space will set up your domain specific dataset project. It will
+     create the resources that you need to build a dataset. Those resources include:
+
+     - A dataset repository on the Hub
+     - Another space to define expert domain and run generation pipelines
+
+     For a complete overview of the project, check out the README.
      """
  )

  st.page_link(
+     "pages/🧑‍🌾 Domain Data Grower.py",
+     label="Domain Data Grower",
+     icon="🧑‍🌾",
  )

+ ################################################################################
+ # CONFIGURATION
+ ################################################################################
+
+ st.subheader("🌾 Project Configuration")
+
+ project_name = st.text_input("Project Name", DEFAULT_DOMAIN)
+ hub_username = st.text_input("Hub Username", "argilla")
+ hub_token = st.text_input("Hub Token", type="password")
+ private_selector = st.checkbox("Private Space", value=False)
+
+ if st.button("🤗 Setup Project Resources"):
+     repo_id = f"{hub_username}/{project_name}"
+
+     setup_dataset_on_hub(
+         repo_id=repo_id,
+         hub_token=hub_token,
+     )
+
+     st.success(
+         f"Dataset seed created and pushed to the Hub. Check it out [here](https://huggingface.co/datasets/{hub_username}/{project_name}). Hold on the repo_id: {repo_id}, we will need it in the next steps."
+     )
+
+     space_name = f"{project_name}_config_space"
+
+     duplicate_space_on_hub(
+         source_repo="argilla/domain-specific-datasets-template",
+         target_repo=space_name,
+         hub_token=hub_token,
+         private=private_selector,
+     )
+
+     st.success(
+         f"Configuration Space created. Check it out [here](https://huggingface.co/spaces/{hub_username}/{space_name})."
+     )
+
+     argilla_name = f"{project_name}_argilla_space"
+
+     duplicate_space_on_hub(
+         source_repo="argilla/argilla-template-space",
+         target_repo=argilla_name,
+         hub_token=hub_token,
+         private=private_selector,
+     )
+
+     st.success(
+         f"Argilla Space created. Check it out [here](https://huggingface.co/spaces/{hub_username}/{argilla_name})."
+     )
+
+     seconds = 5
+
+     with st.spinner(f"Adding project configuration to spaces in {seconds} seconds"):
+         time.sleep(seconds)
+         add_project_config_to_space_repo(
+             dataset_repo_id=repo_id,
+             hub_token=hub_token,
+             project_name=project_name,
+             argilla_space_repo_id=f"{hub_username}/{argilla_name}",
+             project_space_repo_id=f"{hub_username}/{space_name}",
+         )
+
+     st.subheader("👢 Next Steps")
+
+     st.write("Go to your project specific space!")
+
+     st.link_button(
+         "🧑‍🌾 Open Configuration Space",
+         f"https://huggingface.co/spaces/{hub_username}/{space_name}",
+     )
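For reference, the same resource-setup flow the new app.py drives from Streamlit can be sketched as a plain script. This is a minimal illustration only, not part of the commit: it assumes hub.py from this repo is importable, and "my-username" and the HF_TOKEN environment variable are placeholders.

# Minimal sketch of the setup flow outside Streamlit (placeholder values).
import os

from hub import (
    setup_dataset_on_hub,
    duplicate_space_on_hub,
    add_project_config_to_space_repo,
)

hub_username = "my-username"        # placeholder, not from the commit
project_name = "farming"            # default domain used by the app
hub_token = os.environ["HF_TOKEN"]  # write-scoped token, placeholder source

repo_id = f"{hub_username}/{project_name}"
space_name = f"{project_name}_config_space"
argilla_name = f"{project_name}_argilla_space"

# 1. dataset repo with the seed data
setup_dataset_on_hub(repo_id=repo_id, hub_token=hub_token)

# 2. configuration space and Argilla space, duplicated from the templates
duplicate_space_on_hub(
    source_repo="argilla/domain-specific-datasets-template",
    target_repo=space_name,
    hub_token=hub_token,
)
duplicate_space_on_hub(
    source_repo="argilla/argilla-template-space",
    target_repo=argilla_name,
    hub_token=hub_token,
)

# 3. write project_config.json into the new configuration space
add_project_config_to_space_repo(
    dataset_repo_id=repo_id,
    hub_token=hub_token,
    project_name=project_name,
    argilla_space_repo_id=f"{hub_username}/{argilla_name}",
    project_space_repo_id=f"{hub_username}/{space_name}",
)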
hub.py CHANGED
@@ -1,43 +1,10 @@
  import json
- from tempfile import mktemp

- import argilla as rg
- from huggingface_hub import HfApi
-
- from defaults import REMOTE_CODE_PATHS, SEED_DATA_PATH


  hf_api = HfApi()

- with open("DATASET_README_BASE.md") as f:
-     DATASET_README_BASE = f.read()
-
-
- def create_readme(domain_seed_data, project_name, domain):
-     # create a readme for the project that shows the domain and project name
-     readme = DATASET_README_BASE
-     readme += f"# {project_name}\n\n## Domain: {domain}"
-     perspectives = domain_seed_data.get("perspectives")
-     topics = domain_seed_data.get("topics")
-     examples = domain_seed_data.get("examples")
-     if perspectives:
-         readme += "\n\n## Perspectives\n\n"
-         for p in perspectives:
-             readme += f"- {p}\n"
-     if topics:
-         readme += "\n\n## Topics\n\n"
-         for t in topics:
-             readme += f"- {t}\n"
-     if examples:
-         readme += "\n\n## Examples\n\n"
-         for example in examples:
-             readme += f"### {example['question']}\n\n{example['answer']}\n\n"
-     temp_file = mktemp()
-
-     with open(temp_file, "w") as f:
-         f.write(readme)
-     return temp_file
-

  def setup_dataset_on_hub(repo_id, hub_token):
      # create an empty dataset repo on the hub
@@ -45,85 +12,52 @@ def setup_dataset_on_hub(repo_id, hub_token):
          repo_id=repo_id,
          token=hub_token,
          repo_type="dataset",
-         exist_ok=True,
      )

-
- def push_dataset_to_hub(
-     domain_seed_data_path,
-     project_name,
-     domain,
-     pipeline_path,
-     hub_username,
-     hub_token: str,
- ):
-     repo_id = f"{hub_username}/{project_name}"
-
-     setup_dataset_on_hub(repo_id=repo_id, hub_token=hub_token)
-
-     # upload the seed data and readme to the hub
      hf_api.upload_file(
-         path_or_fileobj=domain_seed_data_path,
          path_in_repo="seed_data.json",
-         token=hub_token,
          repo_id=repo_id,
          repo_type="dataset",
      )

-     # upload the readme to the hub
-     domain_seed_data = json.load(open(domain_seed_data_path))
-     hf_api.upload_file(
-         path_or_fileobj=create_readme(
-             domain_seed_data=domain_seed_data, project_name=project_name, domain=domain
-         ),
-         path_in_repo="README.md",
          token=hub_token,
-         repo_id=repo_id,
-         repo_type="dataset",
      )

-
- def push_pipeline_to_hub(
-     pipeline_path,
-     hub_username,
-     hub_token: str,
      project_name,
  ):
-     repo_id = f"{hub_username}/{project_name}"
-
-     # upload the pipeline to the hub
-     hf_api.upload_file(
-         path_or_fileobj=pipeline_path,
-         path_in_repo="pipeline.yaml",
-         token=hub_token,
-         repo_id=repo_id,
-         repo_type="dataset",
-     )

-     for code_path in REMOTE_CODE_PATHS:
-         hf_api.upload_file(
-             path_or_fileobj=code_path,
-             path_in_repo=code_path,
-             token=hub_token,
-             repo_id=repo_id,
-             repo_type="dataset",
          )

-     print(f"Dataset uploaded to {repo_id}")
-
-
- def pull_seed_data_from_repo(repo_id, hub_token):
-     # pull the dataset repo from the hub
-     hf_api.hf_hub_download(
-         repo_id=repo_id, token=hub_token, repo_type="dataset", filename=SEED_DATA_PATH
      )
-     return json.load(open(SEED_DATA_PATH))
-
-
- def push_argilla_dataset_to_hub(
-     name: str, repo_id: str, url: str, api_key: str, workspace: str = "admin"
- ):
-     rg.init(api_url=url, api_key=api_key)
-     feedback_dataset = rg.FeedbackDataset.from_argilla(name=name, workspace=workspace)
-     local_dataset = feedback_dataset.pull()
-     local_dataset.push_to_huggingface(repo_id=repo_id)
  import json

+ from huggingface_hub import duplicate_space, HfApi


  hf_api = HfApi()


  def setup_dataset_on_hub(repo_id, hub_token):
      # create an empty dataset repo on the hub
          repo_id=repo_id,
          token=hub_token,
          repo_type="dataset",
      )

+     # upload the seed data
      hf_api.upload_file(
+         path_or_fileobj="seed_data.json",
          path_in_repo="seed_data.json",
          repo_id=repo_id,
          repo_type="dataset",
+         token=hub_token,
      )

+
+ def duplicate_space_on_hub(source_repo, target_repo, hub_token, private=False):
+     duplicate_space(
+         from_id=source_repo,
+         to_id=target_repo,
          token=hub_token,
+         private=private,
+         exist_ok=True,
      )


+ def add_project_config_to_space_repo(
+     dataset_repo_id,
+     hub_token,
      project_name,
+     argilla_space_repo_id,
+     project_space_repo_id,
  ):
+     # upload the seed data and readme to the hub

+     with open("project_config.json", "w") as f:
+         json.dump(
+             {
+                 "project_name": project_name,
+                 "argilla_space_repo_id": argilla_space_repo_id,
+                 "project_space_repo_id": project_space_repo_id,
+                 "dataset_repo_id": dataset_repo_id,
+             },
+             f,
+         )

+     hf_api.upload_file(
+         path_or_fileobj="project_config.json",
+         path_in_repo="project_config.json",
+         token=hub_token,
+         repo_id=project_space_repo_id,
+         repo_type="space",
      )
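One way to sanity-check that the new helpers created everything is to query the Hub afterwards. This is a rough sketch, not part of the commit: it assumes a recent huggingface_hub release that provides HfApi.repo_exists, and the repo ids and token are placeholders.

# Rough check that the dataset and the two duplicated spaces exist on the Hub.
from huggingface_hub import HfApi

api = HfApi(token="hf_...")  # placeholder token

expected = [
    ("my-username/farming", "dataset"),             # placeholder repo ids
    ("my-username/farming_config_space", "space"),
    ("my-username/farming_argilla_space", "space"),
]

for repo_id, repo_type in expected:
    present = api.repo_exists(repo_id, repo_type=repo_type)
    print(f"{repo_type:7} {repo_id}: {'ok' if present else 'missing'}")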
project_config.json CHANGED
@@ -1 +1 @@
- {"project_name": "domain_test_4", "argilla_space_repo_id": "burtenshaw/domain_test_4_argilla_space", "project_space_repo_id": "burtenshaw/domain_test_4_config_space", "dataset_repo_id": "burtenshaw/domain_test_4"}
+ {"project_name": "farming", "argilla_space_repo_id": "ignacioct/farming_argilla_space", "project_space_repo_id": "ignacioct/farming_config_space", "dataset_repo_id": "ignacioct/farming"}
seed_data.json CHANGED
@@ -1,39 +1,15 @@
  {
      "domain": "farming",
      "perspectives": [
-         "Family Farming",
-         "Agribusiness",
-         "Permaculture",
-         "Agroforestery",
-         "Conventional Farming"
      ],
      "topics": [
-         "animal welfare",
-         "economic growth",
-         "land",
-         "resources",
-         "efficiency"
      ],
      "examples": [
          {
              "question": "Compare and contrast the environmental footprint of industrial and small-scale farming.",
              "answer": "Regenerative agriculture practices aim to restore soil health through methods that increase soil organic matter, enhance microbial activity, and improve soil structure. These practices include no-till farming, cover cropping, diverse crop rotations, and integrated livestock management. According to LaCanne and Lundgren (2018), soil health improves due to increased biodiversity and organic matter, enhancing its water retention and nutrient efficiency. Moreover, Jones (2012) in \"Soil carbon & organic farming\" reports that these practices significantly elevate biodiversity, both above and below the soil surface, promoting resilient ecosystems and agroecological balances."
-         },
-         {
-             "question": "Compare the environmental footprint of small-scale, local farming versus large-scale, industrial agriculture.",
-             "answer": "Industrial agriculture typically emphasizes high-output, monoculture farming reliant on synthetic fertilizers and pesticides, which, as Horrigan, Lawrence, and Walker (2002) argue, leads to greater greenhouse gas emissions, higher energy use, and more water consumption compared to small-scale farming. In contrast, small-scale farms often employ diverse cropping systems and lower chemical inputs, resulting in a smaller environmental footprint. Pimentel et al. (2005) note that small-scale farms tend to have higher yields per unit area when environmental and sustainability factors are integrated into farming practices."
-         },
-         {
-             "question": "Analyze the economic implications of transitioning from conventional to organic farming.",
-             "answer": "Transitioning from conventional to organic farming involves significant changes in farm management, input use, and market engagement. Crowder and Reganold (2015) present evidence that organic farms often yield smaller outputs initially but achieve higher profitability due to premium prices, lower input costs, and improved soil health over time. However, this transition requires upfront investments in knowledge and infrastructure, which can be economically challenging for some farmers, as noted by Seufert and Ramankutty (2017)."
-         },
-         {
-             "question": "Analyze the social, economic and environnmental impacts of land consolidation vs small-scale farmers.",
-             "answer": "Land consolidation has been associated with increased agricultural productivity but also with negative social and environmental impacts. Larger land holdings typically lead to monocultures, which reduce biodiversity and increase vulnerability to pests and diseases, as highlighted by Li et al. (2017). Economically, while consolidation can lead to economies of scale and potential gains in gross margins, it often displaces rural populations, exacerbating poverty and reducing local food diversity (Sutherland et al., 2015)."
-         },
-         {
-             "question": "Investigate the relationship between land ownership patterns, agricultural productivity and environment sustainability. ",
-             "answer": "Land ownership patterns critically influence agricultural productivity and sustainability. Secure land tenure supports investments in long-term improvements such as soil conservation and water management, which are pivotal for sustainable outcomes. Studies by Barrett et al. (2010) demonstrate that fragmented land ownership often results in inefficient resource use and higher transaction costs, which can detract from sustainability goals."
          }
      ],
      "domain_expert_prompt": "You will be asked about family farming and agribusiness related topics, from different perspectives.\n Your answer should be logical and supported by facts, don't fabricate arguments. \n Try to gather a diverse point of view taking into account current theories in agronomy, biology, economics, anthropology and ecology."
  {
      "domain": "farming",
      "perspectives": [
+         "Family Farming"
      ],
      "topics": [
+         "animal welfare"
      ],
      "examples": [
          {
              "question": "Compare and contrast the environmental footprint of industrial and small-scale farming.",
              "answer": "Regenerative agriculture practices aim to restore soil health through methods that increase soil organic matter, enhance microbial activity, and improve soil structure. These practices include no-till farming, cover cropping, diverse crop rotations, and integrated livestock management. According to LaCanne and Lundgren (2018), soil health improves due to increased biodiversity and organic matter, enhancing its water retention and nutrient efficiency. Moreover, Jones (2012) in \"Soil carbon & organic farming\" reports that these practices significantly elevate biodiversity, both above and below the soil surface, promoting resilient ecosystems and agroecological balances."
          }
      ],
      "domain_expert_prompt": "You will be asked about family farming and agribusiness related topics, from different perspectives.\n Your answer should be logical and supported by facts, don't fabricate arguments. \n Try to gather a diverse point of view taking into account current theories in agronomy, biology, economics, anthropology and ecology."