File size: 3,100 Bytes
839621c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import time

from regex import F
from defaults import (
    DEFAULT_DOMAIN,
)
from hub import (
    setup_dataset_on_hub,
    duplicate_space_on_hub,
    add_project_config_to_space_repo,
)

import streamlit as st

# Farmer emoji (U+1F9D1, zero-width joiner, U+1F33E). It is spelled with escape
# sequences so that an editor or encoding round-trip cannot garble it; the
# previous literals were mojibake, which is not a valid emoji for the icon
# arguments below.
FARMER_EMOJI = "\U0001F9D1\u200D\U0001F33E"

# Browser tab title/favicon, then the visible page title and a separator.
st.set_page_config(page_title="Domain Data Grower", page_icon=FARMER_EMOJI)
st.header(f"{FARMER_EMOJI} Domain Data Grower")
st.divider()

################################################################################
# APP MARKDOWN
################################################################################

st.header("🌱 Create a domain specific dataset")

# Introductory text explaining which resources this app creates.
st.markdown(
    """This space will set up your domain specific dataset project. It will
create the resources that you need to build a dataset. Those resources include:

- A dataset repository on the Hub
- Another space to define expert domain and run generation pipelines

For a complete overview of the project, check out the README.
"""
)

# Link to the app's other page.
st.page_link(
    # NOTE(review): the path is kept exactly as it was because it has to match
    # the file name on disk; if that file is renamed to use the real emoji,
    # update this string in the same change.
    "pages/πŸ§‘β€πŸŒΎ Domain Data Grower.py",
    label="Domain Data Grower",
    icon=FARMER_EMOJI,
)

################################################################################
# CONFIGURATION
################################################################################

# Heading for the form that collects the project settings.
st.subheader("🌾 Project Configuration")

# Text inputs: the project name is pre-filled with DEFAULT_DOMAIN and the Hub
# username with "argilla". These names are read by the setup handler below.
project_name = st.text_input(label="Project Name", value=DEFAULT_DOMAIN)
hub_username = st.text_input(label="Hub Username", value="argilla")

# The token is collected through a password-type input and has no default.
hub_token = st.text_input(label="Hub Token", type="password")

# Checkbox, unchecked by default; its value is forwarded as `private` when the
# spaces are duplicated.
private_selector = st.checkbox(label="Private Space", value=False)

# Create every project resource when the user clicks the button: the dataset
# repository, the configuration space, the Argilla space, and finally the
# project configuration that ties them together.
if st.button("🤗 Setup Project Resources"):
    # Names and "<namespace>/<name>" ids, built once and reused for the Hub
    # calls, the links shown to the user and the final configuration step.
    space_name = f"{project_name}_config_space"
    argilla_name = f"{project_name}_argilla_space"
    repo_id = f"{hub_username}/{project_name}"
    project_space_repo_id = f"{hub_username}/{space_name}"
    argilla_space_repo_id = f"{hub_username}/{argilla_name}"

    # Step 1: dataset repository.
    setup_dataset_on_hub(
        repo_id=repo_id,
        hub_token=hub_token,
    )
    st.success(
        f"Dataset seed created and pushed to the Hub. Check it out [here](https://huggingface.co/datasets/{repo_id}).  Hold on to the repo_id: {repo_id}, we will need it in the next steps."
    )

    # Step 2: configuration space, duplicated from the template space.
    # NOTE(review): target_repo is passed without a namespace while the link
    # below (and the configuration step) assume it lives under `hub_username`.
    # Presumably the helper resolves the namespace from the token's account;
    # verify this when `hub_username` is an organisation.
    duplicate_space_on_hub(
        source_repo="argilla/domain-specific-datasets-template",
        target_repo=space_name,
        hub_token=hub_token,
        private=private_selector,
    )
    st.success(
        f"Configuration Space created. Check it out [here](https://huggingface.co/spaces/{project_space_repo_id})."
    )

    # Step 3: Argilla space, duplicated from the Argilla template space.
    duplicate_space_on_hub(
        source_repo="argilla/argilla-template-space",
        target_repo=argilla_name,
        hub_token=hub_token,
        private=private_selector,
    )
    st.success(
        f"Argilla Space created. Check it out [here](https://huggingface.co/spaces/{argilla_space_repo_id})."
    )

    # Step 4: write the project configuration into the new spaces.
    seconds = 5

    with st.spinner(f"Adding project configuration to spaces in {seconds} seconds"):
        # Fixed pause before touching the spaces; presumably this gives the Hub
        # time to finish creating the duplicates (TODO confirm).
        time.sleep(seconds)
        add_project_config_to_space_repo(
            dataset_repo_id=repo_id,
            hub_token=hub_token,
            project_name=project_name,
            argilla_space_repo_id=argilla_space_repo_id,
            project_space_repo_id=project_space_repo_id,
        )