burtenshaw HF staff committed on
Commit
839621c
1 Parent(s): 077efde

Upload 4 files

Browse files
Files changed (4) hide show
  1. app.py +101 -0
  2. defaults.py +7 -0
  3. hub.py +50 -0
  4. pages/🧑‍🌾 Domain Data Grower.py +15 -0
app.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+
3
+ from regex import F
4
+ from defaults import (
5
+ DEFAULT_DOMAIN,
6
+ )
7
+ from hub import (
8
+ setup_dataset_on_hub,
9
+ duplicate_space_on_hub,
10
+ add_project_config_to_space_repo,
11
+ )
12
+
13
+ import streamlit as st
14
+
15
# Page chrome: browser-tab title and icon, the main heading, and a separator.
st.set_page_config(page_title="Domain Data Grower", page_icon="🧑‍🌾")
st.header(body="🧑‍🌾 Domain Data Grower")
st.divider()

################################################################################
# APP MARKDOWN
################################################################################

st.header(body="🌱 Create a domain specific dataset")

# Introductory copy listing the resources this space creates.
st.markdown(
    body="""This space will set up your domain specific dataset project. It will
create the resources that you need to build a dataset. Those resources include:

- A dataset repository on the Hub
- Another space to define expert domain and run generation pipelines

For a complete overview of the project. Check out the README
"""
)

# Link to the secondary page of this multipage app.
st.page_link(
    page="pages/🧑‍🌾 Domain Data Grower.py",
    label="Domain Data Grower",
    icon="🧑‍🌾",
)
41
+
42
################################################################################
# CONFIGURATION
################################################################################

st.subheader(body="🌾 Project Configuration")

# User-supplied settings; they are read when the setup button is pressed.
project_name = st.text_input(label="Project Name", value=DEFAULT_DOMAIN)
hub_username = st.text_input(label="Hub Username", value="argilla")
# Rendered as a password field so the token is masked on screen.
hub_token = st.text_input(label="Hub Token", type="password")
private_selector = st.checkbox(label="Private Space", value=False)
52
+
53
# Provision every project resource on the Hub when the button is pressed:
# the dataset repo, the configuration Space, the Argilla Space, and finally
# the project config file that ties them together.
if st.button("🤗 Setup Project Resources"):
    repo_id = f"{hub_username}/{project_name}"

    setup_dataset_on_hub(
        repo_id=repo_id,
        hub_token=hub_token,
    )

    st.success(
        f"Dataset seed created and pushed to the Hub. Check it out [here](https://huggingface.co/datasets/{hub_username}/{project_name}). Hold on the repo_id: {repo_id}, we will need it in the next steps."
    )

    space_name = f"{project_name}_config_space"
    project_space_repo_id = f"{hub_username}/{space_name}"

    # Pass the fully qualified id. A bare name is created under the token
    # owner's account, which may differ from `hub_username` (for example an
    # organisation); the link below and the project config would then point
    # at a Space that does not exist.
    duplicate_space_on_hub(
        source_repo="argilla/domain-specific-datasets-template",
        target_repo=project_space_repo_id,
        hub_token=hub_token,
        private=private_selector,
    )

    st.success(
        f"Configuration Space created. Check it out [here](https://huggingface.co/spaces/{hub_username}/{space_name})."
    )

    argilla_name = f"{project_name}_argilla_space"
    argilla_space_repo_id = f"{hub_username}/{argilla_name}"

    # Same namespace rule as for the configuration Space above.
    duplicate_space_on_hub(
        source_repo="argilla/argilla-template-space",
        target_repo=argilla_space_repo_id,
        hub_token=hub_token,
        private=private_selector,
    )

    st.success(
        f"Argilla Space created. Check it out [here](https://huggingface.co/spaces/{hub_username}/{argilla_name})."
    )

    seconds = 5

    with st.spinner(f"Adding project configuration to spaces in {seconds} seconds"):
        # NOTE(review): the pause presumably gives the freshly duplicated
        # Spaces time to become available before uploading - confirm.
        time.sleep(seconds)
        add_project_config_to_space_repo(
            dataset_repo_id=repo_id,
            hub_token=hub_token,
            project_name=project_name,
            argilla_space_repo_id=argilla_space_repo_id,
            project_space_repo_id=project_space_repo_id,
        )
defaults.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
# Location of the bundled seed data, relative to the working directory.
SEED_DATA_PATH = "seed_data.json"

# Decode explicitly as UTF-8 rather than relying on the platform default
# encoding, which would mis-read non-ASCII seed content on some systems.
with open(SEED_DATA_PATH, encoding="utf-8") as f:
    DEFAULT_DATA = json.load(f)

# Domain name from the seed data, exposed as the default for the app.
DEFAULT_DOMAIN = DEFAULT_DATA["domain"]
hub.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ from huggingface_hub import duplicate_space, HfApi
4
+
5
+
6
# Module-level Hub client used by the helpers in this module. It is created
# without a token; each call passes the user's token explicitly.
hf_api = HfApi()
7
+
8
+
9
def setup_dataset_on_hub(repo_id, hub_token):
    """Create an empty dataset repository on the Hub.

    Args:
        repo_id: Id of the dataset repository to create.
        hub_token: Token used to authenticate the request.
    """
    repo_options = {
        "repo_id": repo_id,
        "token": hub_token,
        "repo_type": "dataset",
    }
    hf_api.create_repo(**repo_options)
16
+
17
+
18
def duplicate_space_on_hub(source_repo, target_repo, hub_token, private=False):
    """Duplicate an existing Space on the Hub.

    Args:
        source_repo: Id of the Space to copy from.
        target_repo: Id to give the new Space.
        hub_token: Token used to authenticate the request.
        private: Whether the new Space should be private.
    """
    duplicate_space(
        from_id=source_repo,
        to_id=target_repo,
        private=private,
        token=hub_token,
    )
22
+
23
+
24
def add_project_config_to_space_repo(
    dataset_repo_id,
    hub_token,
    project_name,
    argilla_space_repo_id,
    project_space_repo_id,
):
    """Upload ``project_config.json`` to the project's configuration Space.

    The file records the project name and the ids of the dataset repo, the
    Argilla Space and the project Space.

    Args:
        dataset_repo_id: Id of the dataset repository for the project.
        hub_token: Token used to authenticate the upload.
        project_name: Name of the project.
        argilla_space_repo_id: Id of the project's Argilla Space.
        project_space_repo_id: Id of the Space that receives the config file.
    """
    project_config = {
        "project_name": project_name,
        "argilla_space_repo_id": argilla_space_repo_id,
        "project_space_repo_id": project_space_repo_id,
        "dataset_repo_id": dataset_repo_id,
    }

    # Serialise in memory and upload the bytes directly. Writing a fixed-name
    # file into the working directory would be shared by concurrent sessions
    # (one user's config could be uploaded for another) and would leave a
    # stale file behind.
    payload = json.dumps(project_config).encode("utf-8")

    hf_api.upload_file(
        path_or_fileobj=payload,
        path_in_repo="project_config.json",
        token=hub_token,
        repo_id=project_space_repo_id,
        repo_type="space",
    )
pages/🧑‍🌾 Domain Data Grower.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import requests
3
+
4
+
5
# README URL on raw.githubusercontent.com, pinned to a specific commit hash.
readme_location = (
    "https://raw.githubusercontent.com/huggingface/data-is-better-together/"
    "4d7848149dcfe575b86517ca15e4aaa09dc9db74/domain-specific-datasets/README.md"
)
6
+
7
+
8
def open_markdown_file(url, timeout=10):
    """Download the document at ``url`` and return its text.

    Args:
        url: Address of the markdown file to fetch.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the page indefinitely.

    Returns:
        The response body decoded as text.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of rendering an error page body as the README.
    response.raise_for_status()
    return response.text
11
+
12
+
13
# Fetch the README on each script run and render it as the page body.
readme = open_markdown_file(url=readme_location)

st.markdown(body=readme)