domain-specific-datasets-welcome

Sleeping

App Files Files Community

burtenshaw HF staff committed on Apr 22

Commit

839621c

•

1 Parent(s): 077efde

Upload 4 files

Browse files

Files changed (4) hide show

app.py +101 -0
defaults.py +7 -0
hub.py +50 -0
pages/🧑‍🌾 Domain Data Grower.py +15 -0

app.py ADDED Viewed

	@@ -0,0 +1,101 @@

+import time
+from regex import F
+from defaults import (
+    DEFAULT_DOMAIN,
+)
+from hub import (
+    setup_dataset_on_hub,
+    duplicate_space_on_hub,
+    add_project_config_to_space_repo,
+)
+import streamlit as st
+st.set_page_config("Domain Data Grower", page_icon="🧑‍🌾")
+st.header("🧑‍🌾 Domain Data Grower")
+st.divider()
+################################################################################
+# APP MARKDOWN
+################################################################################
+st.header("🌱 Create a domain specific dataset")
+st.markdown(
+    """This space will set up your domain specific dataset project. It will
+create the resources that you need to build a dataset. Those resources include:
+- A dataset repository on the Hub
+- Another space to define expert domain and run generation pipelines
+For a complete overview of the project. Check out the README
+"""
+)
+st.page_link(
+    "pages/🧑‍🌾 Domain Data Grower.py",
+    label="Domain Data Grower",
+    icon="🧑‍🌾",
+)
+################################################################################
+# CONFIGURATION
+################################################################################
+st.subheader("🌾 Project Configuration")
+project_name = st.text_input("Project Name", DEFAULT_DOMAIN)
+hub_username = st.text_input("Hub Username", "argilla")
+hub_token = st.text_input("Hub Token", type="password")
+private_selector = st.checkbox("Private Space", value=False)
+if st.button("🤗 Setup Project Resources"):
+    repo_id = f"{hub_username}/{project_name}"
+    setup_dataset_on_hub(
+        repo_id=repo_id,
+        hub_token=hub_token,
+    )
+    st.success(
+        f"Dataset seed created and pushed to the Hub. Check it out [here](https://huggingface.co/datasets/{hub_username}/{project_name}).  Hold on the repo_id: {repo_id}, we will need it in the next steps."
+    )
+    space_name = f"{project_name}_config_space"
+    duplicate_space_on_hub(
+        source_repo="argilla/domain-specific-datasets-template",
+        target_repo=space_name,
+        hub_token=hub_token,
+        private=private_selector,
+    )
+    st.success(
+        f"Configuration Space created. Check it out [here](https://huggingface.co/spaces/{hub_username}/{space_name})."
+    )
+    argilla_name = f"{project_name}_argilla_space"
+    duplicate_space_on_hub(
+        source_repo="argilla/argilla-template-space",
+        target_repo=argilla_name,
+        hub_token=hub_token,
+        private=private_selector,
+    )
+    st.success(
+        f"Argilla Space created. Check it out [here](https://huggingface.co/spaces/{hub_username}/{argilla_name})."
+    )
+    seconds = 5
+    with st.spinner(f"Adding project configuration to spaces in {seconds} seconds"):
+        time.sleep(seconds)
+        add_project_config_to_space_repo(
+            dataset_repo_id=repo_id,
+            hub_token=hub_token,
+            project_name=project_name,
+            argilla_space_repo_id=f"{hub_username}/{argilla_name}",
+            project_space_repo_id=f"{hub_username}/{space_name}",
+        )

defaults.py ADDED Viewed

	@@ -0,0 +1,7 @@

+import json
+SEED_DATA_PATH = "seed_data.json"
+with open(SEED_DATA_PATH) as f:
+    DEFAULT_DATA = json.load(f)
+DEFAULT_DOMAIN = DEFAULT_DATA["domain"]

hub.py ADDED Viewed

	@@ -0,0 +1,50 @@

+import json
+from huggingface_hub import duplicate_space, HfApi
+# Module-level Hub client shared by the helpers below; the token is passed
+# per call rather than stored on the client.
+hf_api = HfApi()
+def setup_dataset_on_hub(repo_id, hub_token):
+    # create an empty dataset repo on the hub
+    hf_api.create_repo(
+        repo_id=repo_id,
+        token=hub_token,
+        repo_type="dataset",
+    )
+def duplicate_space_on_hub(source_repo, target_repo, hub_token, private=False):
+    duplicate_space(
+        from_id=source_repo, to_id=target_repo, token=hub_token, private=private
+    )
+def add_project_config_to_space_repo(
+    dataset_repo_id,
+    hub_token,
+    project_name,
+    argilla_space_repo_id,
+    project_space_repo_id,
+):
+    #  upload the seed data and readme to the hub
+    with open("project_config.json", "w") as f:
+        json.dump(
+            {
+                "project_name": project_name,
+                "argilla_space_repo_id": argilla_space_repo_id,
+                "project_space_repo_id": project_space_repo_id,
+                "dataset_repo_id": dataset_repo_id,
+            },
+            f,
+        )
+    hf_api.upload_file(
+        path_or_fileobj="project_config.json",
+        path_in_repo="project_config.json",
+        token=hub_token,
+        repo_id=project_space_repo_id,
+        repo_type="space",
+    )

pages/🧑‍🌾 Domain Data Grower.py ADDED Viewed

	@@ -0,0 +1,15 @@

+import streamlit as st
+import requests
+readme_location = "https://raw.githubusercontent.com/huggingface/data-is-better-together/4d7848149dcfe575b86517ca15e4aaa09dc9db74/domain-specific-datasets/README.md"
+def open_markdown_file(url):
+    response = requests.get(url)
+    return response.text
+readme = open_markdown_file(readme_location)
+st.markdown(readme)