Ben Burtenshaw committed on
Commit
5776d7d
•
1 Parent(s): 7a96aaa

update for local pipeline

Browse files
parent_app/__pycache__/defaults.cpython-311.pyc DELETED
Binary file (561 Bytes)
 
parent_app/__pycache__/hub.cpython-311.pyc DELETED
Binary file (1.7 kB)
 
parent_app/app.py DELETED
@@ -1,118 +0,0 @@
1
- import time
2
-
3
- from hub import (
4
- setup_dataset_on_hub,
5
- duplicate_space_on_hub,
6
- add_project_config_to_space_repo,
7
- )
8
-
9
- import streamlit as st
10
-
11
-
12
- # Constants
13
- # Written here to avoid defaults.py
14
- DEFAULT_DOMAIN = "farming"
15
-
16
- st.set_page_config(
17
- "Domain Data Grower", page_icon="🧑‍🌾", initial_sidebar_state="collapsed"
18
- )
19
-
20
- st.header("🧑‍🌾 Domain Data Grower")
21
- st.divider()
22
-
23
- st.sidebar.link_button(
24
- "🤗 Get your Hub Token", "https://huggingface.co/settings/tokens"
25
- )
26
-
27
- ################################################################################
28
- # APP MARKDOWN
29
- ################################################################################
30
-
31
- st.header("🌱 Create a domain specific dataset")
32
-
33
- st.markdown(
34
- """This space will set up your domain specific dataset project. It will
35
- create the resources that you need to build a dataset. Those resources include:
36
-
37
- - A dataset repository on the Hub
38
- - Another space to define expert domain and run generation pipelines
39
-
40
- For a complete overview of the project. Check out the README
41
- """
42
- )
43
-
44
- st.page_link(
45
- "pages/🧑‍🌾 Domain Data Grower.py",
46
- label="Domain Data Grower",
47
- icon="🧑‍🌾",
48
- )
49
-
50
- ################################################################################
51
- # CONFIGURATION
52
- ################################################################################
53
-
54
- st.subheader("🌾 Project Configuration")
55
-
56
- project_name = st.text_input("Project Name", DEFAULT_DOMAIN)
57
- hub_username = st.text_input("Hub Username", "argilla")
58
- hub_token = st.text_input("Hub Token", type="password")
59
- private_selector = st.checkbox("Private Space", value=False)
60
-
61
- if st.button("🤗 Setup Project Resources"):
62
- repo_id = f"{hub_username}/{project_name}"
63
-
64
- setup_dataset_on_hub(
65
- repo_id=repo_id,
66
- hub_token=hub_token,
67
- )
68
-
69
- st.success(
70
- f"Dataset seed created and pushed to the Hub. Check it out [here](https://huggingface.co/datasets/{hub_username}/{project_name}). Hold on the repo_id: {repo_id}, we will need it in the next steps."
71
- )
72
-
73
- space_name = f"{project_name}_config_space"
74
-
75
- duplicate_space_on_hub(
76
- source_repo="argilla/domain-specific-datasets-template",
77
- target_repo=space_name,
78
- hub_token=hub_token,
79
- private=private_selector,
80
- )
81
-
82
- st.success(
83
- f"Configuration Space created. Check it out [here](https://huggingface.co/spaces/{hub_username}/{space_name})."
84
- )
85
-
86
- argilla_name = f"{project_name}_argilla_space"
87
-
88
- duplicate_space_on_hub(
89
- source_repo="argilla/argilla-template-space",
90
- target_repo=argilla_name,
91
- hub_token=hub_token,
92
- private=private_selector,
93
- )
94
-
95
- st.success(
96
- f"Argilla Space created. Check it out [here](https://huggingface.co/spaces/{hub_username}/{argilla_name})."
97
- )
98
-
99
- seconds = 5
100
-
101
- with st.spinner(f"Adding project configuration to spaces in {seconds} seconds"):
102
- time.sleep(seconds)
103
- add_project_config_to_space_repo(
104
- dataset_repo_id=repo_id,
105
- hub_token=hub_token,
106
- project_name=project_name,
107
- argilla_space_repo_id=f"{hub_username}/{argilla_name}",
108
- project_space_repo_id=f"{hub_username}/{space_name}",
109
- )
110
-
111
- st.subheader("👢 Next Steps")
112
-
113
- st.write("Go to you project specific space!")
114
-
115
- st.link_button(
116
- "🧑‍🌾 Open Configuration Space",
117
- f"https://huggingface.co/spaces/{hub_username}/{space_name}",
118
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
parent_app/hub.py DELETED
@@ -1,76 +0,0 @@
1
- import json
2
- from tempfile import mktemp
3
-
4
-
5
- from huggingface_hub import duplicate_space, HfApi
6
-
7
-
8
- hf_api = HfApi()
9
-
10
-
11
- def setup_dataset_on_hub(repo_id, hub_token):
12
- # create an empty dataset repo on the hub
13
- hf_api.create_repo(
14
- repo_id=repo_id,
15
- token=hub_token,
16
- repo_type="dataset",
17
- )
18
-
19
- # upload the seed data
20
- hf_api.upload_file(
21
- path_or_fileobj="seed_data.json",
22
- path_in_repo="seed_data.json",
23
- repo_id=repo_id,
24
- repo_type="dataset",
25
- token=hub_token,
26
- )
27
-
28
-
29
- def duplicate_space_on_hub(source_repo, target_repo, hub_token, private=False):
30
- duplicate_space(
31
- from_id=source_repo,
32
- to_id=target_repo,
33
- token=hub_token,
34
- private=private,
35
- exist_ok=True,
36
- )
37
-
38
-
39
- def add_project_config_to_space_repo(
40
- dataset_repo_id,
41
- hub_token,
42
- project_name,
43
- argilla_space_repo_id,
44
- project_space_repo_id,
45
- ):
46
- # upload the seed data and readme to the hub
47
-
48
- with open("project_config.json", "w") as f:
49
- json.dump(
50
- {
51
- "project_name": project_name,
52
- "argilla_space_repo_id": argilla_space_repo_id,
53
- "project_space_repo_id": project_space_repo_id,
54
- "dataset_repo_id": dataset_repo_id,
55
- },
56
- f,
57
- )
58
-
59
- hf_api.upload_file(
60
- path_or_fileobj="project_config.json",
61
- path_in_repo="project_config.json",
62
- token=hub_token,
63
- repo_id=project_space_repo_id,
64
- repo_type="space",
65
- )
66
-
67
-
68
- def pull_seed_data_from_repo(repo_id, hub_token):
69
- tempfile_path = mktemp()
70
- # pull the dataset repo from the hub
71
- hf_api.hf_hub_download(
72
- repo_id=repo_id, token=hub_token, repo_type="dataset", filename=tempfile_path
73
- )
74
- return json.load(open(tempfile_path))
75
-
76
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
parent_app/pages/🧑‍🌾 Domain Data Grower.py DELETED
@@ -1,15 +0,0 @@
1
- import streamlit as st
2
- import requests
3
-
4
-
5
- readme_location = "https://raw.githubusercontent.com/huggingface/data-is-better-together/51f29e67165d8277d9f9d1e4be60869f4b705a08/domain-specific-datasets/README.md"
6
-
7
-
8
- def open_markdown_file(url):
9
- response = requests.get(url)
10
- return response.text
11
-
12
-
13
- readme = open_markdown_file(readme_location)
14
-
15
- st.markdown(readme)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
parent_app/project_config.json DELETED
@@ -1 +0,0 @@
1
- {"project_name": "farming", "argilla_space_repo_id": "ignacioct/farming_argilla_space", "project_space_repo_id": "ignacioct/farming_config_space", "dataset_repo_id": "ignacioct/farming"}
 
 
parent_app/seed_data.json DELETED
@@ -1,16 +0,0 @@
1
- {
2
- "domain": "farming",
3
- "perspectives": [
4
- "Family Farming"
5
- ],
6
- "topics": [
7
- "animal welfare"
8
- ],
9
- "examples": [
10
- {
11
- "question": "Compare and contrast the environmental footprint of industrial and small-scale farming.",
12
- "answer": "Regenerative agriculture practices aim to restore soil health through methods that increase soil organic matter, enhance microbial activity, and improve soil structure. These practices include no-till farming, cover cropping, diverse crop rotations, and integrated livestock management. According to LaCanne and Lundgren (2018), soil health improves due to increased biodiversity and organic matter, enhancing its water retention and nutrient efficiency. Moreover, Jones (2012) in \"Soil carbon & organic farming\" reports that these practices significantly elevate biodiversity, both above and below the soil surface, promoting resilient ecosystems and agroecological balances."
13
- }
14
- ],
15
- "domain_expert_prompt": "You will be asked about family farming and agribusiness related topics, from different perspectives.\n Your answer should be logical and supported by facts, don't fabricate arguments. \n Try to gather a diverse point of view taking into account current theories in agronomy, biology, economics, anthropology and ecology."
16
- }